

library(dplyr)
library(ggplot2)
library(wordbankr)
# Package overview ----
# Open the package help index and list the objects wordbankr exports.
help(package = "wordbankr")
ls("package:wordbankr")
##  [1] "fit_aoa"                 "fit_vocab_quantiles"
##  [3] "get_administration_data" "get_crossling_data"
##  [5] "get_crossling_items"     "get_instrument_data"
##  [7] "get_instruments"         "get_item_data"
##  [9] "get_sources"             "summarise_items"

# Instruments and data sources ----
get_instruments()
get_sources()
get_sources(language = "English (American)")

# Administration data ----
# Administrations of the English (American) WS form.
admins_eng_ws <- get_administration_data(
  language = "English (American)",
  form = "WS"
)
admins_eng_ws

# Number of distinct administration ids, and administrations per age.
n_distinct(admins_eng_ws$data_id)
## [1] 5520
admins_eng_ws %>% count(age)

# Productive vocabulary against age, one jittered point per administration.
ggplot(admins_eng_ws, aes(x = age, y = production)) +
  geom_jitter(colour = "grey", size = 0.5) +
  geom_smooth() +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  theme_classic()

# Narrow by language only, by form only, or not at all.
admins_russian <- get_administration_data(language = "Russian")
admins_russian

admins_ws <- get_administration_data(form = "WS")
admins_ws

admins <- get_administration_data()
admins

nrow(admins)
## [1] 82055
admins %>% count(language, form)
# Item data ----
# Items on the English (American) WS form.
items_eng_ws <- get_item_data(language = "English (American)", form = "WS")
items_eng_ws

# Distinct values of the columns that classify items.
items_eng_ws %>% distinct(type)
items_eng_ws %>% distinct(category)
items_eng_ws %>% distinct(lexical_category)

# Items for every language and form, with item counts per instrument.
items <- get_item_data()
items
items %>% count(language, form)

# Instrument (by-item) data ----
# Look up the item ids whose definition is "dog" or "cat".
ids <- items_eng_ws %>%
  filter(definition %in% c("dog", "cat")) %>%
  pull(item_id)

# Responses for those items only.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids
)

# Same request, additionally asking for administration and item information.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids,
  administrations = TRUE,
  iteminfo = TRUE
)

# Supply already-filtered tables instead of TRUE: administrations at 24 months
# and the item rows for "dog" and "cat".
twos <- admins_eng_ws %>% filter(age == 24)
dog_cat <- items_eng_ws %>% filter(definition %in% c("dog", "cat"))
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = dog_cat$item_id,
  administrations = twos,
  iteminfo = dog_cat
)
# Exercises ----
# 1. Compute and plot median productive vocabulary size (as a proportion of
#    total words) over age in each language. Limit to WS data for children
#    16-30 months old (hint: left_join and facet_wrap are likely to be
#    helpful).
# 2. For English WS data, compute and plot the proportion of children that
#    produce each word in the "toys" category at each age.
# 3. (Bonus: Do the same thing as in 2, but separately for girls and boys.)
# Exercise 1: median productive vocabulary by language and age ----

# Number of word items on each language's WS form; used as the denominator
# when converting raw production counts into proportions.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# WS administrations for 16-30 month olds, with production expressed as a
# proportion of the form's word count, summarised to a median per language
# and age.
vocab_summary <- admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  # `language` is the only column num_words shares with admins; naming it
  # makes the join key explicit and silences the "Joining, by" message.
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words) %>%
  group_by(language, age) %>%
  summarise(median_vocab = median(prop_vocab)) %>%
  ungroup()
# Exercise 1: plot ----
# One panel per language showing median vocabulary proportion against age.
ggplot(vocab_summary, aes(x = age, y = median_vocab)) +
  facet_wrap(~language, ncol = 7) +
  geom_point(size = 0.6) +
  ylim(0, 1) +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  # theme_classic() is a complete theme and replaces any theme settings added
  # before it, so it must come first for the strip-text tweak to take effect.
  theme_classic() +
  theme(strip.text = element_text(size = rel(0.5)))
# --- {.build}
# Exercise 2: proportion of children producing each "toys" word by age ----

# Word items in the "toys" category of the English (American) WS form.
toys <- items_eng_ws %>%
  filter(type == "word", category == "toys")

# By-item responses for those words, with administration and item information
# supplied from the tables already in memory. `produces` is TRUE only when the
# recorded value is exactly "produces"; missing values count as FALSE.
toys_data <- get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = toys$item_id,
  administrations = admins_eng_ws,
  iteminfo = toys
) %>%
  mutate(produces = !is.na(value) & value == "produces")

# `produces` is never NA, so its mean is the share of TRUE responses.
toys_summary <- toys_data %>%
  group_by(definition, age) %>%
  summarise(prop_produces = mean(produces)) %>%
  ungroup()
# Exercise 2: plot ----
# One panel per word, with a smoothed curve of the producing proportion by age.
ggplot(toys_summary, aes(x = age, y = prop_produces)) +
  facet_wrap(~definition, ncol = 6) +
  geom_smooth() +
  labs(x = "Age (months)", y = "Proportion of children producing") +
  theme_classic()
# Exercise 3: the Exercise 2 summary, split by sex ----
# Rows with a missing `sex` are dropped before grouping so that no NA group
# appears. `produces` is never NA, so its mean is the share of TRUE responses.
toys_summary_sex <- toys_data %>%
  filter(!is.na(sex)) %>%
  group_by(definition, age, sex) %>%
  summarise(prop_produces = mean(produces)) %>%
  ungroup()
# Exercise 3: plot ----
# One panel per word, with a separate smoothed curve (no confidence band) for
# each sex.
ggplot(toys_summary_sex, aes(x = age, y = prop_produces)) +
  facet_wrap(~definition, ncol = 6) +
  geom_smooth(aes(colour = sex), se = FALSE) +
  # scale_colour_ptol() comes from ggthemes, which this script never attaches,
  # so it is namespace-qualified here.
  ggthemes::scale_colour_ptol(name = "") +
  labs(x = "Age (months)", y = "Proportion of children producing") +
  theme_classic()
# Fit with fit_aoa()'s default settings and keep each word's definition
# alongside its `aoa` column.
toys_data %>%
  fit_aoa() %>%
  select(definition, aoa)

# Same fit, requesting the "glmrob" method and a proportion of 0.8.
toys_data %>%
  fit_aoa(method = "glmrob", proportion = 0.8) %>%
  select(definition, aoa)

# Cross-linguistic lookups: the available items, then the data for the
# uni-lemma "dog".
get_crossling_items()
get_crossling_data(uni_lemmas = "dog")
# Use data from Wordbank/childes-db to explore a question about language
# learning. Some ideas:
# Wordbank
#   - wordbank.stanford.edu
#   - github.com/langcog/wordbankr
#   - langcog.github.io/wordbankr
#   - mb-cdi.stanford.edu
#   Citation: Frank, M. C., Braginsky, M., Yurovsky, D., & Marchman, V. A.
#   (2017). Wordbank: An open repository for developmental vocabulary data.
#   Journal of Child Language, 44(3), 677-694.
# childes-db
#   - childes-db.stanford.edu/
#   - github.com/langcog/childesr
#   - childes.talkbank.org
#   Citation: Sanchez, A., Meylan, S. C., Braginsky, M., MacDonald, K. E.,
#   Yurovsky, D., & Frank, M. C. (2019). childes-db: A flexible and
#   reproducible interface to the Child Language Data Exchange System.
#   Behavior Research Methods, 1-14.
# This presentation
#   - github.com/mikabr/acq-tools
#   - mikabr.github.io/acq-tools
#   Contact: mikabr@mit.edu, mcfrank@stanford.edu